# This file contains the analysis with uncertainty

rm(list=ls())
library(xtable)
library("arm")
library("foreign")
library("lme4")

setwd(".../Final replication file")


#################################################
#### Analysis for State Senate Districts ########
#################################################


# District id variable
districts<-read.csv(paste("districts_ssd.csv",sep=""))
district.uid<-districts$ssd_district_uid


### Prepare survey data for response model with state senate variables 
data1<-read.dta("gay_marriage_senate.dta")
state.gaymarriage<-data1$state_gaymarriage
race2<-data1$race
cst2<-factor(data1$cst)
#survey2<-factor(data1$survey)
education5<-factor(data1$education)
factor.education<-factor(data1$education5)
region<-data1$region_num
districturban.full<-data1$ssd_district_urban
districtincome.full<-data1$ssd_district_income/100000
districtveteran.full<-data1$ssd_district_veteran
statereligion.full<-data1$state_religion
stateunion.full<-data1$state_union
districtgay.full<-data1$ssd_district_gay
district_num<-data1$ssd_district_uid
gender<-data1$gender

# adding age as predictor
age <- data1$age_exact
age.group <- rep(0,length(age))
age.group[age<20] <- 0
age.group[age>19&age<25] <- 1
age.group[age>24&age<30] <- 2
age.group[age>29&age<35] <- 3
age.group[age>34&age<40] <- 4
age.group[age>39&age<45] <- 5
age.group[age>44&age<50] <- 6
age.group[age>49&age<55] <- 7
age.group[age>54&age<60] <- 8
age.group[age>59&age<65] <- 9
age.group[age>64&age<70] <- 10
age.group[age>69&age<75] <- 11
age.group[age>74&age<80] <- 12
age.group[age>79&age<85] <- 13
age.group[age>84&age<90] <- 14
age.group[age>89] <- 15



##############################################################################################################
##### Post-Stratification Step
new.data <- read.dta("DATA_v4_ageshares.dta")
new.data1 <- new.data[,c(47,53:67)] 
colnames(new.data1)
age.multiplier <- c(t(as.matrix(new.data1[,-1])))

## Survey data

DATAmodel <- data.frame(cbind(age.group, state.gaymarriage, race2, gender, cst2, district_num, education5, districtincome.full, districturban.full, districtveteran.full, statereligion.full, stateunion.full, region, districtgay.full))


# generating sample information:
# race (2-3-1-4)- gender (0-1) - education (1:5) - AGE.GROUP (1:15)
table(DATAmodel$race2)
table(DATAmodel$gender)
table(DATAmodel$education5)
DATAmodel$age.group
AGE <- factor(DATAmodel$age.group, levels = c(0:15))
Survey.Age.Info <- matrix(NA,40,16)
Survey.Age.Info[1,] <- table(AGE[DATAmodel$race2==2 & DATAmodel$gender==0 & DATAmodel$education5==1])
Survey.Age.Info[2,] <- table(AGE[DATAmodel$race2==2 & DATAmodel$gender==0 & DATAmodel$education5==2])
Survey.Age.Info[3,] <- table(AGE[DATAmodel$race2==2 & DATAmodel$gender==0 & DATAmodel$education5==3])
Survey.Age.Info[4,] <- table(AGE[DATAmodel$race2==2 & DATAmodel$gender==0 & DATAmodel$education5==4])
Survey.Age.Info[5,] <- table(AGE[DATAmodel$race2==2 & DATAmodel$gender==0 & DATAmodel$education5==5])
Survey.Age.Info[6,] <- table(AGE[DATAmodel$race2==2 & DATAmodel$gender==1 & DATAmodel$education5==1])
Survey.Age.Info[7,] <- table(AGE[DATAmodel$race2==2 & DATAmodel$gender==1 & DATAmodel$education5==2])
Survey.Age.Info[8,] <- table(AGE[DATAmodel$race2==2 & DATAmodel$gender==1 & DATAmodel$education5==3])
Survey.Age.Info[9,] <- table(AGE[DATAmodel$race2==2 & DATAmodel$gender==1 & DATAmodel$education5==4])
Survey.Age.Info[10,] <- table(AGE[DATAmodel$race2==2 & DATAmodel$gender==1 & DATAmodel$education5==5])
Survey.Age.Info[11,] <- table(AGE[DATAmodel$race2==3 & DATAmodel$gender==0 & DATAmodel$education5==1])
Survey.Age.Info[12,] <- table(AGE[DATAmodel$race2==3 & DATAmodel$gender==0 & DATAmodel$education5==2])
Survey.Age.Info[13,] <- table(AGE[DATAmodel$race2==3 & DATAmodel$gender==0 & DATAmodel$education5==3])
Survey.Age.Info[14,] <- table(AGE[DATAmodel$race2==3 & DATAmodel$gender==0 & DATAmodel$education5==4])
Survey.Age.Info[15,] <- table(AGE[DATAmodel$race2==3 & DATAmodel$gender==0 & DATAmodel$education5==5])
Survey.Age.Info[16,] <- table(AGE[DATAmodel$race2==3 & DATAmodel$gender==1 & DATAmodel$education5==1])
Survey.Age.Info[17,] <- table(AGE[DATAmodel$race2==3 & DATAmodel$gender==1 & DATAmodel$education5==2])
Survey.Age.Info[18,] <- table(AGE[DATAmodel$race2==3 & DATAmodel$gender==1 & DATAmodel$education5==3])
Survey.Age.Info[19,] <- table(AGE[DATAmodel$race2==3 & DATAmodel$gender==1 & DATAmodel$education5==4])
Survey.Age.Info[20,] <- table(AGE[DATAmodel$race2==3 & DATAmodel$gender==1 & DATAmodel$education5==5])
Survey.Age.Info[21,] <- table(AGE[DATAmodel$race2==1 & DATAmodel$gender==0 & DATAmodel$education5==1])
Survey.Age.Info[22,] <- table(AGE[DATAmodel$race2==1 & DATAmodel$gender==0 & DATAmodel$education5==2])
Survey.Age.Info[23,] <- table(AGE[DATAmodel$race2==1 & DATAmodel$gender==0 & DATAmodel$education5==3])
Survey.Age.Info[24,] <- table(AGE[DATAmodel$race2==1 & DATAmodel$gender==0 & DATAmodel$education5==4])
Survey.Age.Info[25,] <- table(AGE[DATAmodel$race2==1 & DATAmodel$gender==0 & DATAmodel$education5==5])
Survey.Age.Info[26,] <- table(AGE[DATAmodel$race2==1 & DATAmodel$gender==1 & DATAmodel$education5==1])
Survey.Age.Info[27,] <- table(AGE[DATAmodel$race2==1 & DATAmodel$gender==1 & DATAmodel$education5==2])
Survey.Age.Info[28,] <- table(AGE[DATAmodel$race2==1 & DATAmodel$gender==1 & DATAmodel$education5==3])
Survey.Age.Info[29,] <- table(AGE[DATAmodel$race2==1 & DATAmodel$gender==1 & DATAmodel$education5==4])
Survey.Age.Info[30,] <- table(AGE[DATAmodel$race2==1 & DATAmodel$gender==1 & DATAmodel$education5==5])
Survey.Age.Info[31,] <- table(AGE[DATAmodel$race2==4 & DATAmodel$gender==0 & DATAmodel$education5==1])
Survey.Age.Info[32,] <- table(AGE[DATAmodel$race2==4 & DATAmodel$gender==0 & DATAmodel$education5==2])
Survey.Age.Info[33,] <- table(AGE[DATAmodel$race2==4 & DATAmodel$gender==0 & DATAmodel$education5==3])
Survey.Age.Info[34,] <- table(AGE[DATAmodel$race2==4 & DATAmodel$gender==0 & DATAmodel$education5==4])
Survey.Age.Info[35,] <- table(AGE[DATAmodel$race2==4 & DATAmodel$gender==0 & DATAmodel$education5==5])
Survey.Age.Info[36,] <- table(AGE[DATAmodel$race2==4 & DATAmodel$gender==1 & DATAmodel$education5==1])
Survey.Age.Info[37,] <- table(AGE[DATAmodel$race2==4 & DATAmodel$gender==1 & DATAmodel$education5==2])
Survey.Age.Info[38,] <- table(AGE[DATAmodel$race2==4 & DATAmodel$gender==1 & DATAmodel$education5==3])
Survey.Age.Info[39,] <- table(AGE[DATAmodel$race2==4 & DATAmodel$gender==1 & DATAmodel$education5==4])
Survey.Age.Info[40,] <- table(AGE[DATAmodel$race2==4 & DATAmodel$gender==1 & DATAmodel$education5==5])

colnames(Survey.Age.Info) <- c("<20", "20--24","25--29", "30--34", "35--39", "40--44", "45--49", "50--54", "55--59", "60--64","65--69","70--74","75--79","80--84","85--90",">90")
print(xtable(Survey.Age.Info,caption = "Table 9 in Appendix"),include.rownames=FALSE)

Survey.Age.frequency <- colSums(Survey.Age.Info[,-1])/sum(Survey.Age.Info[,-1])
tabL <- matrix(NA,16,3)
tabL[2:16,1] <- round(Survey.Age.frequency,3)
tabL[2:16,2] <- unlist(round(new.data1[1,-1],3))
tabL[2:16,3] <- round(tabL[2:16,2]/tabL[2:16,1],3)
rownames(tabL) <- c("","<20","20-24","25-29","30-34","35-39","40-44","45-49","50-54","55-59","60-64","65-69","70-74","75-79","80-84",">84")

xtable(tabL,caption = "Table 8 in Appendix")
# Read in census data without age
### Prepare data for predictions with state senate data (n=77680, which is 2*4*5 * 1942) 
PS.Data<-read.csv(paste("Post-Stratification Data - RImport (ssd).csv",sep=""))

# CHANGE TO NEW SET-UP; 2 * 4 * 5 * 15 * 1942
# race - gender - education
N.sim <- 100			# simulation number for synthetic uncertainty
# now, I need to change the age category
b <- rep(15,dim(PS.Data)[1])
PS.Data1 <- PS.Data[rep(1:nrow(PS.Data), b), ]
age.category <- rep(c(1:15),77680)
PS.Data1 <- cbind(PS.Data1,age.category)
PS.Data.n <- matrix(NA,1165200,N.sim)

for (t in 1:N.sim){
	PS.Data.n[,t] <- PS.Data1[,6]
	}

p.vec <- matrix(NA,40,15)
DRAWS <- array(NA,c(100,15,40))
#Big.Age.district.percent <- matrix(NA,1165200,N.sim)
for (a in 1:1942){
		a1 <- (a-1)*600 + 1	
		a2 <- a*600
		district.census.Age.frequency <- new.data1[a,-1]
		district.corrector <- district.census.Age.frequency/ Survey.Age.frequency
		Age.district <- Survey.Age.Info[,-1] %*% diag(district.corrector)
		Age.district.percent <- Age.district/rowSums(Age.district)
			for (c in 1:40){
				p.vec[c,] <- round(Age.district)[c,]/sum(round(Age.district)[c,])	
				draws <- rmultinom(N.sim,size=sum(round(Age.district)[c,]),prob=p.vec[c,])
				DRAWS[,,c] <- t(draws/colSums(draws))		
			}
			for (d in 1:N.sim){
					Big.Age.district.percent <- c(DRAWS[d,,])		
					PS.Data.n[c(a1:a2),d] <- PS.Data.n[c(a1:a2),d] * Big.Age.district.percent
			}
		print(a)
}


###########	
# Prepare data for disaggregation (calculate mean for each senate district)
sample.mean.gaymarriage<-tapply(state.gaymarriage, data1$ssd_district_uid, mean)
sample.count.gaymarriage<-tapply(state.gaymarriage, data1$ssd_district_uid, length)
sample.mean.gaymarriage.names<-as.numeric(rownames(sample.mean.gaymarriage))
sample.mean.gaymarriage.adj1<-cbind(sample.mean.gaymarriage, sample.mean.gaymarriage.names)
sample.mean.gaymarriage.adj2<-data.frame(matrix(NA, nrow=1942, ncol=4))

sample.count.gaymarriage.names<-as.numeric(rownames(sample.count.gaymarriage))
sample.count.gaymarriage.adj1<-cbind(sample.count.gaymarriage, sample.count.gaymarriage.names)
# 1 senate district is missing
	
## Create a vector of district effects without any missing cells (zeros where no data for a district)
for (i in 1:length(unique(district.uid)))
{
sample.mean.gaymarriage.adj2[i,1]<-sum(sample.mean.gaymarriage.adj1[sample.mean.gaymarriage.names==i,1], na.rm=FALSE)
sample.mean.gaymarriage.adj2[i,2]<-sum(sample.count.gaymarriage.adj1[sample.count.gaymarriage.names==i,1], na.rm=FALSE)
}

sample.mean.gaymarriage.adj2[,3]<-districts$cst

sample.mean.gaymarriage.adj2[,4]<-sample.mean.gaymarriage.adj2[,1]

for (i in 1:length(unique(district.uid))) {
				if (sample.mean.gaymarriage.adj2[i,2]==0){
									sample.mean.gaymarriage.adj2[i,4]<-NA
									}
				}
sample.mean.gaymarriage<-sample.mean.gaymarriage.adj2[,4]

	
## Marriage response model

marriage.model.wAGE <-  glmer(formula = state.gaymarriage ~ (1 | age.group) + (1 | race2) + gender + (1 | cst2) + (1 | district_num) +  (1 | education5)+ districtincome.full + districturban.full + districtveteran.full + statereligion.full + stateunion.full + (1 | region) + districtgay.full, family=binomial(link="logit"), control = glmerControl(optimizer = "bobyqa"))

ps_cregion<-PS.Data1$ps_cregion
cst_ps<-PS.Data1$cst_ps
cat_educ<-PS.Data1$cat_educ
cat_race<-PS.Data1$cat_race
cat_female<-PS.Data1$cat_female
ps_district_income <-PS.Data1$ps_district_income/100
ps_district_urban<-PS.Data1$ps_district_urban
ps_district_veteran<-PS.Data1$ps_district_veteran
ps_districtgay<-PS.Data1$ps_district_gay
ps_state_religion<-PS.Data1$ps_state_religion
ps_state_union<-PS.Data1$ps_state_union
ps_district_uid<-PS.Data1$ssd_district_uid
cat_age<-PS.Data1$age.category


# Get RE for each district (1 is missing)
district.effects<-ranef(marriage.model.wAGE)$district_num
district.effects.names<-as.numeric(rownames(district.effects))
district.effects2<-cbind(district.effects, district.effects.names)

district.effects3<-data.frame(matrix(NA, nrow=1942, ncol=1))

# Create a vectors of district effects without any missing cells (zeros where no data for a district)
for (i in 1:length(unique(district.uid))){
	j<-"i"
	district.effects3[i,1]<-sum(district.effects2[district.effects2$district.effects.names==i,1], na.rm=TRUE)
	#print(sum(district.effects2[district.effects2$district.effects.names==i,1], na.rm=TRUE))
}


# Get RE for each state 
state.effects<-ranef(marriage.model.wAGE)$cst2
state.effects.names<-rownames(state.effects)
state.effects2<-cbind(state.effects, state.effects.names)
state.effects3<-data.frame(matrix(NA, nrow=51, ncol=2))
states<-unique(cst_ps)
state.effects3[,2]<-states

state.effects3[1:48,1]<-state.effects2[,1]
states2<-factor(states)

## Create a vectors of district effects without any missing cells (zeros where no data for a district)
for (i in 1:length(states)){
	state.effects3[i,1]<-sum(state.effects2[state.effects.names==states[i],1], na.rm=TRUE)
}
state.effects3[50,2]<-"AK"
state.effects3[51,2]<-"HI"
	

# Make predictions
cell.pred <-invlogit(fixef(marriage.model.wAGE)["(Intercept)"]+ ranef(marriage.model.wAGE)$age.group[cat_age,1] +ranef(marriage.model.wAGE)$education5[cat_educ,1] + ranef(marriage.model.wAGE)$race2[cat_race,1] +   fixef(marriage.model.wAGE)["gender"] *cat_female +    district.effects3[ps_district_uid,1] + state.effects3[cst_ps,1]  +  ranef(marriage.model.wAGE)$region[ps_cregion,1]  + fixef(marriage.model.wAGE)["districtincome.full"]*ps_district_income + fixef(marriage.model.wAGE)["districturban.full"]*ps_district_urban + fixef(marriage.model.wAGE)["districtveteran.full"]*ps_district_veteran+fixef(marriage.model.wAGE)["statereligion.full"]*ps_state_religion + fixef(marriage.model.wAGE)["stateunion.full"]*ps_state_union+ fixef(marriage.model.wAGE)["districtgay.full"]*ps_districtgay)


# Post-stratification
#cell.pred.weighted.MATRIX <- matrix(NA,length(cell.pred),N.sim) 
mrp.mean.gaymarriage.MATRIX <- matrix(NA,1942,N.sim)

for (w in 1:N.sim){
	proportion<-PS.Data.n[,w]
	cell.pred.weighted <- cell.pred * proportion
	mrp.mean.gaymarriage.MATRIX[,w]<-tapply(cell.pred.weighted, PS.Data1$ssd_district_uid, sum)
}

# MrP per State

ohio <-read.csv(paste("OH_GM_SSD.csv",sep=""))
mrp.ohio<-mrp.mean.gaymarriage.MATRIX[1389:1421,]

ca <-read.csv(paste("CA_GM_SSD.csv",sep=""))
mrp.ca<-mrp.mean.gaymarriage.MATRIX[121:160,]

wi<-read.csv(paste("WI_GM_SSD.csv",sep=""))
mrp.wi<-mrp.mean.gaymarriage.MATRIX[1880:1912,]

mi <-read.csv(paste("MI_GM_SSD.csv",sep=""))
mrp.mi<-mrp.mean.gaymarriage.MATRIX[815:852,]

az <-read.csv(paste("AZ_GM_SSD.csv",sep=""))
mrp.az<-mrp.mean.gaymarriage.MATRIX[56:85,]

ssd.mrp<-rbind(mrp.ohio, mrp.ca, mrp.wi, mrp.mi, mrp.az)
ssd.referendum<-append(ohio[,2], ca[,2])
ssd.referendum<-append(ssd.referendum, wi[,2])
ssd.referendum<-append(ssd.referendum, mi[,2])
ssd.referendum<-append(ssd.referendum, az[,2])


#########################################################################################################
#########################################################################################################

#Generate median, 0.025 lower bound, and 0.975 upper bound estimator
ssd.mrp.sorted <- t(apply(ssd.mrp,1,sort))
mrp.point.est <- rowMeans(ssd.mrp)

mean((ssd.referendum-rowMeans(ssd.mrp))^2)
mean((ssd.referendum-mrp.point.est)^2)
mean((ssd.referendum-ssd.mrp.sorted[,50])^2)

ssd.mrp.sorted.small <- ssd.mrp.sorted



##########   F U L L   U N C E R T A I N T Y   #############

n.simsL <- 100
model.SIM <- sim(marriage.model.wAGE, n.sims=n.simsL)

cell.pred.MATRIX <- matrix(NA,1165200,n.simsL)

for (g in 1:n.simsL){
	# fixing missing RE's for districts
	district.effects<-ranef(model.SIM)$district_num[g,,]
	district.effects.names<-as.numeric(names(district.effects))
	district.effects2<-cbind(district.effects, district.effects.names)
	district.effects3<-data.frame(matrix(NA, nrow=1942, ncol=1))
	# Create a vectors of district effects without any missing cells (zeros where no data for a district)
	for (i in 1:length(unique(district.uid))){
		j<-i
		district.effects3[i,1]<-sum(district.effects2[district.effects2[,2]==i,1], na.rm=TRUE)
	}

	# state effects
	state.effects<-ranef(model.SIM)$cst[g,,]
	state.effects.names<- names(state.effects)
	state.effects2<-cbind(state.effects, state.effects.names)
	state.effects3<-data.frame(matrix(NA, nrow=51, ncol=2))
	states<-unique(cst_ps)
	state.effects3[,2]<-states

	state.effects3[1:48,1]<-state.effects2[,1]
	states2<-factor(states)

	state.effects3[50,2]<-"AK"
	state.effects3[51,2]<-"HI"

	
	cell.pred.MATRIX[,g] <- invlogit(fixef(model.SIM)[g,1]+ ranef(model.SIM)$age.group[g,cat_age,1] + ranef(model.SIM)$education5[g,cat_educ,1] + ranef(model.SIM)$race2[g,cat_race,1] + fixef(model.SIM)[g,2]*cat_female + district.effects3[ps_district_uid,1] + as.numeric(state.effects3[cst_ps,1]) + ranef(model.SIM)$region[g,ps_cregion,1] + fixef(model.SIM)[g,3]*ps_district_income+ fixef(model.SIM)[g,4]*ps_district_urban + fixef(model.SIM)[g,5]*ps_district_veteran + fixef(model.SIM)[g,6]*ps_state_religion + fixef(model.SIM)[g,7]*ps_state_union + fixef(model.SIM)[g,8]*ps_districtgay )
}


output.mrp.UNCERTAINTY <- array(NA, c(1942,N.sim, n.simsL))
for (w in 1:N.sim){
	for (v in 1:n.simsL){
	proportion<-PS.Data.n[,w]
	cell.pred.weighted <- cell.pred.MATRIX[,v] * proportion
	output.mrp.UNCERTAINTY[,w,v]<-tapply(cell.pred.weighted, PS.Data1$ssd_district_uid, sum)
	}
}

output.mrp.UNCERTAINTY.flat <- matrix(NA, 1942,N.sim*n.simsL)
for (k in 1:1942){
	output.mrp.UNCERTAINTY.flat[k,] <- c(output.mrp.UNCERTAINTY[k,,])
}

#########################################################################################################

#Generate median, 0.025 lower bound, and 0.975 upper bound estimator
ssd.mrp.sorted <- matrix(NA, 1942,N.sim*n.simsL)
for (y in 1:1942){
  if (is.na(output.mrp.UNCERTAINTY.flat[y,1])!=TRUE){
    ssd.mrp.sorted[y,] <- sort(output.mrp.UNCERTAINTY.flat[y,])
  }
}


########################################################################################
########################################################################################

png("Figure07.png", width = 1800, height = 1000, pointsize=24)
#par(mfrow=c(1,2), family="CMU Serif")
par(mfrow=c(1,2))
colL <- "red"
plot(ssd.mrp.sorted.small[,50], ssd.referendum, xlab="", ylab="", axes=F, ylim=c(0,1),  xlim=c(0,1), pch=20, col="white", main="Uncertainty due to synthetic joint distribution", cex=0.5)
axis(1, at = c(0, .2, .4,.6,.8, 1),cex.axis=1.4, label=c("0%","20%", "40%",  "60%",  "80%",  "100%"), pos=0)
axis(2, at = c(0, .2, .4,.6,.8, 1),cex.axis=1.4, label=c("0%","20%", "40%",  "60%",  "80%",  "100%"), las = 1, tick = T, pos=0)
title(main="",xlab = "Predictions", ylab = "Vote Outcome", cex.lab=1.5)
clip(0, 1, 0, 1)
##abline(regression2)
segments(0, 0,1,1,col="grey", lty=2)
segments(ssd.mrp.sorted.small[,2], ssd.referendum,ssd.mrp.sorted.small[,98], ssd.referendum, col=colL, lwd=1, lty=1)

plot(ssd.mrp.sorted[c(1389:1421,121:160,1880:1912,815:852,56:85),5000], ssd.referendum, main="Full uncertainty (model and synthetic joint)", xlab="", ylab="", axes=F, ylim=c(0,1),  xlim=c(0,1), pch=20, col="white", cex=0.5)
axis(1, at = c(0, .2, .4,.6,.8, 1),cex.axis=1.4, label=c("0%","20%", "40%",  "60%",  "80%",  "100%"), pos=0)
axis(2, at = c(0, .2, .4,.6,.8, 1),cex.axis=1.4, label=c("0%","20%", "40%",  "60%",  "80%",  "100%"), las = 1, tick = T, pos=0)
title(main="",xlab = "Predictions", ylab = "Vote Outcome", cex.lab=1.5)
clip(0, 1, 0, 1)
##abline(regression2)
segments(0, 0,1,1,col="grey", lty=2 )
segments(ssd.mrp.sorted[c(1389:1421,121:160,1880:1912,815:852,56:85),250], ssd.referendum,ssd.mrp.sorted[c(1389:1421,121:160,1880:1912,815:852,56:85),9750], ssd.referendum, col=colL, lwd=1, lty=1)
dev.off()


mean((ssd.mrp.sorted.small[,98]-ssd.mrp.sorted.small[,2])/(ssd.mrp.sorted[c(1389:1421,121:160,1880:1912,815:852,56:85),9750] - ssd.mrp.sorted[c(1389:1421,121:160,1880:1912,815:852,56:85),250]),na.rm=TRUE)





